Given \(M\) meanings to express, and \(W\) wordforms with which to express them, each associated with some probability \(p_i\), how many meanings should each wordform \(w_i\) carry as a function of its normalized probability \(p_i\)?
We also define several helper functions to simplify the data processing pipeline.
First, we need a function that reads in an aggregated lexicon (i.e., across homophonous wordforms), as well as the original lexicon with original entries intact; this function will then identify the correct number of meanings M per word length, and merge that information with the aggregated lexicon.
read_data = function(agg_path) {
  # Reads an aggregated lexicon (one row per wordform, aggregated across
  # homophonous entries) from `agg_path`, computes the number of meanings
  # M per word length (in syllables), and merges that count back in.
  #
  # Returns the aggregated dataframe with an added column:
  #   M: total number of lexical entries (meanings) of that syllable length.
  df_agg = read_csv(agg_path)
  # Each row stands for num_homophones + 1 lexical entries (the wordform
  # itself plus its homophones), so summing k_actual per syllable length
  # gives the number of *words* M of that length.
  df_total_counts = df_agg %>%
    mutate(k_actual = num_homophones + 1) %>%
    group_by(num_sylls_est) %>%
    summarise(M = sum(k_actual))
  # Merge the per-length counts back onto the aggregated data.
  # (Removed a stray `nrow(df_agg)` whose value was silently discarded.)
  df_agg = merge(df_agg, df_total_counts, by = "num_sylls_est")
  df_agg
}
We then define a function to normalize the probability of a given wordform relative to the sum of probabilities of wordforms for that length.
normalize_probabilities = function(df) {
  # For a given dataframe, normalize probabilities "p" relative to the
  # total probability mass of wordforms of a given length (in syllables).
  #
  # Adds columns:
  #   normalized_surprisal: LSTM surprisal per phone.
  #   p: probability of the wordform (2^logprob_lstm; logprob is base 2).
  #   total_prob: sum of p over wordforms of the same syllable length.
  #   p_normalized: p / total_prob.
  df = df %>%
    mutate(normalized_surprisal = surprisal_lstm / num_phones,
           p = 2 ^ logprob_lstm)
  # Calculate sum(p) for each syllable-length bucket.
  df_syl_sums = df %>%
    group_by(num_sylls_est) %>%
    summarise(total_prob = sum(p))
  # BUG FIX: left_join's key argument is `by`, not `on`. The original
  # `on = "num_sylls_est"` was silently absorbed by `...` and the join
  # fell back to all shared columns — which happened to be only
  # num_sylls_est (hence the 'Joining, by = "num_sylls_est"' message),
  # so results are unchanged, but the intent is now explicit.
  df = df %>%
    left_join(df_syl_sums, by = "num_sylls_est") %>%
    mutate(p_normalized = p / total_prob)
  df
}
We now define a function to compute the expected number of homophones \(k_i\) for a given wordform \(w_i\). This will also calculate the delta between the real and expected amount.
compute_expected_homophones = function(df) {
  # Expected number of homophones "k" for each wordform: its normalized
  # probability times the number of meanings "M" for its length, minus 1
  # (the wordform's own entry). Also computes delta = real - expected.
  df %>%
    mutate(k = p_normalized * M - 1,
           delta = num_homophones - k)
}
We also define a function to run the main regression:
run_regression = function(df) {
  # Fits the main regression predicting delta (real - expected homophones)
  # from frequency, syllable count, and normalized surprisal, and compares
  # it against a reduced model without frequency.
  #
  # Returns a list with:
  #   model: tidied coefficient table of the full model.
  #   comparison: tidied ANOVA comparison (reduced vs. full).
  model_full = lm(data = df,
                  delta ~ frequency + num_sylls_est + normalized_surprisal)
  model_reduced = lm(data = df,
                     delta ~ num_sylls_est + normalized_surprisal)
  # Nested model comparison; na.omit drops the first ANOVA row, whose
  # comparison statistics are all NA.
  df_comp = anova(model_reduced, model_full) %>%
    broom::tidy() %>%
    na.omit()
  list(model = broom::tidy(model_full),
       comparison = df_comp)
}
And several plotting functions:
plot_outcome = function(df_output) {
  # Plots the regression coefficients from run_regression() with
  # +/- 2 standard-error bars, predictors on the y-axis (via coord_flip).
  # `df_output` is the list returned by run_regression().
  df_output$model$predictor = fct_recode(
    df_output$model$term,
    "Number of Syllables" = "num_sylls_est",
    "Normalized Surprisal" = "normalized_surprisal",
    "Log(Frequency)" = "frequency"
  )
  df_output$model %>%
    ggplot(aes(x = predictor, y = estimate)) +
    geom_point() +
    coord_flip() +
    geom_hline(yintercept = 0, linetype = "dotted") +
    geom_errorbar(aes(ymin = estimate - 2 * std.error,
                      ymax = estimate + 2 * std.error),
                  width = .2,
                  position = position_dodge(.9)) +
    labs(x = "Predictor", y = "Estimate") +
    theme_minimal()
}
plot_comparison = function(df) {
  # Scatterplot of expected (k) vs. actual (num_homophones) homophone
  # counts per wordform, with the identity line for reference.
  upper = max(df$k)
  df %>%
    ggplot(aes(x = k, y = num_homophones)) +
    geom_point(alpha = .5) +
    # NOTE(review): both axes are capped at max(df$k), so actual counts
    # above the maximum *expected* count fall outside the y-axis limits —
    # confirm this truncation is intended.
    scale_x_continuous(limits = c(-1, upper)) +
    scale_y_continuous(limits = c(0, upper)) +
    geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    labs(x = "Expected number of homophones",
         y = "Actual number of homophones") +
    theme_minimal()
}
plot_bins = function(df, n, column, label) {
  # Plots mean delta (real - expected homophones) per quantile bin of
  # `column`, with +/- 1 SE error bars and a smoothed (loess) trend.
  #   n: number of quantile bins.
  #   column: string name of the column to bin.
  #   label: x-axis label.
  df$binned = as.numeric(ntile(pull(df, column), n = n))
  df_summary = df %>%
    group_by(binned) %>%
    summarise(mean_delta = mean(delta),
              se_delta = sd(delta) / sqrt(n()))
  df_summary %>%
    ggplot(aes(x = binned, y = mean_delta)) +
    geom_point(stat = "identity", size = .2) +
    geom_hline(yintercept = 0, linetype = "dotted") +
    geom_errorbar(aes(ymin = mean_delta - se_delta,
                      ymax = mean_delta + se_delta),
                  width = .2,
                  position = position_dodge(.9)) +
    labs(x = label, y = "Delta (Real - Expected)") +
    geom_smooth() +
    theme_minimal()
}
plot_contrast = function(df, bins = 20) {
  # Contrasts expected (k) vs. actual (num_homophones) homophone counts
  # across frequency bins, drawn as mean +/- SE pointranges per bin.
  #   bins: number of quantile bins for frequency.
  #
  # BUG FIX: the `bins` argument was previously ignored (the bin count was
  # hard-coded to 20). It is now honored; the default keeps the old
  # behavior, so existing callers are unaffected.
  df$frequency_binned = as.numeric(ntile(df$frequency, n = bins))
  # Mean +/- one standard error of the mean, per bin and lexicon.
  se_lo = function(x) mean(x) - sd(x) / sqrt(length(x))
  se_hi = function(x) mean(x) + sd(x) / sqrt(length(x))
  g = df %>%
    mutate(expected = k,
           actual = num_homophones) %>%
    pivot_longer(c(expected, actual), names_to = "model",
                 values_to = "homophones") %>%
    ggplot(aes(x = frequency_binned,
               y = homophones,
               color = model)) +
    stat_summary(fun = mean,
                 fun.min = se_lo,
                 fun.max = se_hi,
                 geom = "pointrange") +
    labs(x = "Binned Frequency",
         y = "Number of Homophones") +
    theme_bw()
  g
}
First, we load and process the data.
## setwd("/Users/seantrott/Dropbox/UCSD/Research/Ambiguity/Evolution/homophony_delta/analyses")
df_english = read_data("../data/processed/english/reals/english_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Word = col_character(),
## PhonDISC = col_character(),
## Class = col_character(),
## remove = col_logical(),
## words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_english)
## [1] 35107
## Now normalize probabilities and compute expected homophones per wordform
df_english = df_english %>%
normalize_probabilities() %>%
compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_english %>%
group_by(num_sylls_est) %>%
summarise(total = sum(k + 1),
correct_total = mean(M))
## # A tibble: 9 × 3
## num_sylls_est total correct_total
## <dbl> <dbl> <dbl>
## 1 1 7706 7706
## 2 2 15247 15247
## 3 3 11379 11379
## 4 4 5316 5316
## 5 5 1694 1694
## 6 6 439 439
## 7 7 95 95
## 8 8 10 10
## 9 10 1 1
We then merge with data for frequency of each individual lemma. (We also calculate entropy over senses in the process.)
df_english_lemmas = read_delim("../data/frequency/english/lemmas.csv", delim = "\\",
quote = "")
## Parsed with column specification:
## cols(
## PhonDISC = col_character(),
## Cob = col_double()
## )
nrow(df_english_lemmas)
## [1] 52447
df_english_lemmas = df_english_lemmas %>%
mutate(freq_adjusted = Cob + 1) %>%
group_by(PhonDISC) %>%
mutate(total_frequency = sum(freq_adjusted),
relative_frequency = freq_adjusted / total_frequency)
df_entropy = df_english_lemmas %>%
group_by(PhonDISC) %>%
summarise(entropy = -sum(relative_frequency * log(relative_frequency)),
total_frequency = mean(total_frequency))
df_merged_english = df_english %>%
# inner_join(df_entropy, by = "PhonDISC")
inner_join(df_entropy, by = "PhonDISC")
nrow(df_merged_english)
## [1] 28914
df_merged_english$frequency = log10(df_merged_english$total_frequency)
Compare the original n-phone surprisal to the LSTM surprisal:
df_merged_english %>%
ggplot(aes(x = surprisal,
y = surprisal_lstm,
color = frequency)) +
geom_point(alpha = .3) +
geom_smooth(method = "lm") +
theme_bw() +
labs(x = "Surprisal (N-phone model)",
y = "Surprisal (LSTM)") +
facet_wrap(~num_sylls_est)
## `geom_smooth()` using formula 'y ~ x'
Now we can visualize the outcome in a variety of ways:
df_merged_english %>%
plot_bins(n = 20, column = "frequency", label = "Binned frequency")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
And finally, we can run the regression and visualize model coefficients:
output = df_merged_english %>%
run_regression()
output %>%
plot_outcome()
output$model
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -4.38 0.163 -26.9 6.10e-157
## 2 frequency -0.319 0.0253 -12.6 1.51e- 36
## 3 num_sylls_est 0.415 0.0299 13.9 1.13e- 43
## 4 normalized_surprisal 0.948 0.0258 36.8 1.38e-289
output$comparison
## # A tibble: 1 × 6
## res.df rss df sumsq statistic p.value
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28910 420087. 1 2323. 160. 1.51e-36
Directly contrast the relationships:
df_merged_english %>%
plot_contrast(bins = 20)
First, we load and process the data.
df_dutch = read_data("../data/processed/dutch/reals/dutch_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Word = col_character(),
## PhonStrsDISC = col_character(),
## PhonCVBr = col_character(),
## PhonSylBCLX = col_character(),
## PhonStrsStDISC = col_character(),
## PhonStCVBr = col_character(),
## PhonSylStBCLX = col_character(),
## PhonolCLX = col_character(),
## PhonolCPA = col_character(),
## PhonDISC = col_character(),
## remove = col_logical(),
## words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_dutch)
## [1] 65260
## Now normalize probabilities and compute expected homophones per wordform
df_dutch = df_dutch %>%
normalize_probabilities() %>%
compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_dutch %>%
group_by(num_sylls_est) %>%
summarise(total = sum(k + 1),
correct_total = mean(M))
## # A tibble: 9 × 3
## num_sylls_est total correct_total
## <dbl> <dbl> <dbl>
## 1 1 4672 4672
## 2 2 20588 20588
## 3 3 26420 26420
## 4 4 11338 11338
## 5 5 3394 3394
## 6 6 846 846
## 7 7 189 189
## 8 8 28 28
## 9 9 2 2
We then merge with data for frequency of each individual lemma. (We also calculate entropy over senses in the process.)
df_dutch_lemmas = read_delim("../data/frequency/dutch/lemmas.csv", delim = "\\")
## Parsed with column specification:
## cols(
## PhonDISC = col_character(),
## Inl = col_double()
## )
nrow(df_dutch_lemmas)
## [1] 123926
df_dutch_lemmas = df_dutch_lemmas %>%
mutate(freq_adjusted = Inl + 1) %>%
group_by(PhonDISC) %>%
mutate(total_frequency = sum(freq_adjusted),
relative_frequency = freq_adjusted / total_frequency)
df_entropy = df_dutch_lemmas %>%
group_by(PhonDISC) %>%
summarise(entropy = -sum(relative_frequency * log(relative_frequency)),
total_frequency = mean(total_frequency))
df_merged_dutch = df_dutch %>%
# inner_join(df_entropy, by = "PhonDISC")
inner_join(df_entropy, by = "PhonDISC")
nrow(df_merged_dutch)
## [1] 63364
df_merged_dutch$frequency = log10(df_merged_dutch$total_frequency)
Now we can visualize the outcome in a variety of ways:
df_merged_dutch %>%
plot_comparison()
df_merged_dutch %>%
plot_bins(n = 20, column = "frequency", label = "Binned frequency")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
And finally, we can run the regression and visualize model coefficients:
output = df_merged_dutch %>%
run_regression()
output %>%
plot_outcome()
output$model
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -3.45 0.145 -23.8 3.76e-124
## 2 frequency -1.21 0.0241 -50.3 0
## 3 num_sylls_est 0.237 0.0250 9.48 2.68e- 21
## 4 normalized_surprisal 1.13 0.0277 40.6 0
output$comparison
## # A tibble: 1 × 6
## res.df rss df sumsq statistic p.value
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 63360 2007313. 1 80120. 2529. 0
Directly contrast the relationships:
df_merged_dutch %>%
plot_contrast(bins = 20)
df_german = read_data("../data/processed/german/reals/german_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Word = col_character(),
## PhonDISC = col_character(),
## remove = col_logical(),
## words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_german)
## [1] 50435
## Now normalize probabilities and compute expected homophones per wordform
df_german = df_german %>%
normalize_probabilities() %>%
compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_german %>%
group_by(num_sylls_est) %>%
summarise(total = sum(k + 1),
correct_total = mean(M))
## # A tibble: 10 × 3
## num_sylls_est total correct_total
## <dbl> <dbl> <dbl>
## 1 1 2454 2454
## 2 2 13181 13181
## 3 3 19429 19429
## 4 4 10983 10983
## 5 5 3971 3971
## 6 6 1240 1240
## 7 7 328 328
## 8 8 72 72
## 9 9 16 16
## 10 10 2 2
We then merge with data for frequency of each individual lemma. (We also calculate entropy over senses in the process.)
df_german_lemmas = read_delim("../data/frequency/german/lemmas.csv", delim = "\\")
## Parsed with column specification:
## cols(
## PhonDISC = col_character(),
## Mann = col_double()
## )
nrow(df_german_lemmas)
## [1] 51728
df_german_lemmas = df_german_lemmas %>%
mutate(freq_adjusted = Mann + 1) %>%
group_by(PhonDISC) %>%
mutate(total_frequency = sum(freq_adjusted),
relative_frequency = freq_adjusted / total_frequency)
df_entropy = df_german_lemmas %>%
group_by(PhonDISC) %>%
summarise(entropy = -sum(relative_frequency * log(relative_frequency)),
total_frequency = mean(total_frequency))
df_merged_german = df_german %>%
# inner_join(df_entropy, by = "PhonDISC")
inner_join(df_entropy, by = "PhonDISC")
nrow(df_merged_german)
## [1] 46720
df_merged_german$frequency = log10(df_merged_german$total_frequency)
Now we can visualize the outcome in a variety of ways:
df_merged_german %>%
plot_comparison()
df_merged_german %>%
plot_bins(n = 20, column = "frequency", label = "Binned frequency")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
And finally, we can run the regression and visualize model coefficients:
output = df_merged_german %>%
run_regression()
output %>%
plot_outcome()
output$model
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -4.37 0.148 -29.5 1.19e-189
## 2 frequency -1.01 0.0307 -33.1 7.03e-237
## 3 num_sylls_est 0.302 0.0232 13.0 1.24e- 38
## 4 normalized_surprisal 1.21 0.0267 45.2 0
output$comparison
## # A tibble: 1 × 6
## res.df rss df sumsq statistic p.value
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 46716 1142074. 1 26713. 1093. 7.03e-237
Directly contrast the relationships:
df_merged_german %>%
plot_contrast(bins = 20)
df_french = read_data("../data/processed/french/reals/french_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## `1_ortho` = col_character(),
## `2_phon` = col_character(),
## `3_lemme` = col_character(),
## `4_cgram` = col_character(),
## `5_genre` = col_character(),
## `6_nombre` = col_character(),
## `11_infover` = col_character(),
## `17_cvcv` = col_character(),
## `18_p_cvcv` = col_character(),
## `23_syll` = col_character(),
## `25_cv-cv` = col_character(),
## `26_orthrenv` = col_character(),
## `27_phonrenv` = col_character(),
## `28_orthosyll` = col_character(),
## `29_cgramortho` = col_character(),
## `34_morphoder` = col_character(),
## remove = col_logical(),
## words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_french)
## [1] 37278
## Now normalize probabilities and compute expected homophones per wordform
df_french = df_french %>%
normalize_probabilities() %>%
compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_french %>%
group_by(num_sylls_est) %>%
summarise(total = sum(k + 1),
correct_total = mean(M))
## # A tibble: 9 × 3
## num_sylls_est total correct_total
## <dbl> <dbl> <dbl>
## 1 1 3893 3893
## 2 2 13794 13794
## 3 3 15258 15258
## 4 4 7589 7589
## 5 5 2466 2466
## 6 6 670 670
## 7 7 97 97
## 8 8 12 12
## 9 9 3 3
Read in French lemmas.
df_french_lemmas = read_csv("../data/processed/french/reals/french_all_reals_4phone.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## `1_ortho` = col_character(),
## `2_phon` = col_character(),
## `3_lemme` = col_character(),
## `4_cgram` = col_character(),
## `5_genre` = col_character(),
## `6_nombre` = col_character(),
## `11_infover` = col_character(),
## `17_cvcv` = col_character(),
## `18_p_cvcv` = col_character(),
## `23_syll` = col_character(),
## `25_cv-cv` = col_character(),
## `26_orthrenv` = col_character(),
## `27_phonrenv` = col_character(),
## `28_orthosyll` = col_character(),
## `29_cgramortho` = col_character(),
## `34_morphoder` = col_character(),
## remove = col_logical()
## )
## See spec(...) for full column specifications.
df_french_lemmas$frequency = df_french_lemmas$`10_freqlivres` + 1
nrow(df_french_lemmas)
## [1] 43782
df_french_lemmas = df_french_lemmas %>%
## Multiply by 14.8, b/c measures were divided by 14.8 in Lexique
mutate(freq_adjusted = `8_freqlemlivres` * 14.8) %>%
group_by(`2_phon`) %>%
summarise(total_frequency = sum(frequency))
nrow(df_french_lemmas)
## [1] 37278
df_french_merged = df_french %>%
inner_join(df_french_lemmas, by = "2_phon")
nrow(df_french)
## [1] 37278
nrow(df_french_merged)
## [1] 37278
df_french_merged$frequency = log10(df_french_merged$total_frequency + 1)
And finally, we can run the regression and visualize model coefficients:
output = df_french_merged %>%
run_regression()
output %>%
plot_outcome()
output$model
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -4.27 0.151 -28.2 5.30e-173
## 2 frequency -0.885 0.0507 -17.5 5.58e- 68
## 3 num_sylls_est 0.395 0.0223 17.7 9.78e- 70
## 4 normalized_surprisal 0.934 0.0252 37.0 6.10e-295
output$comparison
## # A tibble: 1 × 6
## res.df rss df sumsq statistic p.value
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 37274 515983. 1 4219. 305. 5.58e-68
Directly contrast the relationships:
df_french_merged %>%
plot_contrast(bins = 20)
df_japanese = read_data("../data/processed/japanese/reals/japanese_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## orth_form_kanji = col_character(),
## orth_form_hiragana = col_character(),
## orth_form_romaji = col_character(),
## phonetic_form = col_character(),
## morph_form = col_character(),
## glosses = col_character(),
## multiple_pronunications = col_logical(),
## phonetic_remapped = col_character(),
## remove = col_logical(),
## words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_japanese)
## [1] 40449
## Now normalize probabilities and compute expected homophones per wordform
df_japanese = df_japanese %>%
normalize_probabilities() %>%
compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_japanese %>%
group_by(num_sylls_est) %>%
summarise(total = sum(k + 1),
correct_total = mean(M))
## # A tibble: 17 × 3
## num_sylls_est total correct_total
## <dbl> <dbl> <dbl>
## 1 1 1011 1011
## 2 2 7710 7710
## 3 3 14816 14816
## 4 4 14237 14237
## 5 5 6619 6619
## 6 6 3673 3673
## 7 7 1657 1657
## 8 8 733 733
## 9 9 345 345
## 10 10 185 185
## 11 11 72 72
## 12 12 52 52
## 13 13 19 19
## 14 14 8 8
## 15 15 7 7
## 16 16 2 2
## 17 22 1 1
Japanese already has frequency data.
df_japanese_lemmas = read_csv("../data/processed/japanese/reals/japanese_all_reals_4phone.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## `Unnamed: 0` = col_double(),
## orth_form_kanji = col_character(),
## orth_form_hiragana = col_character(),
## orth_form_romaji = col_character(),
## phonetic_form = col_character(),
## morph_form = col_character(),
## frequency = col_double(),
## glosses = col_character(),
## multiple_pronunications = col_logical(),
## phonetic_remapped = col_character(),
## num_phones = col_double(),
## num_sylls_est = col_double(),
## remove = col_logical(),
## num_homophones = col_double(),
## log_prob = col_double(),
## surprisal = col_double()
## )
nrow(df_japanese_lemmas)
## [1] 51147
df_japanese_lemmas = df_japanese_lemmas %>%
group_by(phonetic_remapped) %>%
summarise(total_frequency = mean(frequency))
nrow(df_japanese_lemmas)
## [1] 40449
df_japanese_merged = df_japanese %>%
inner_join(df_japanese_lemmas, by = "phonetic_remapped")
nrow(df_japanese)
## [1] 40449
nrow(df_japanese_merged)
## [1] 40449
df_japanese_merged$frequency = log10(df_japanese_merged$total_frequency + 1)
Now, visualize the relationship.
And finally, we can run the regression and visualize model coefficients:
output = df_japanese_merged %>%
run_regression()
output %>%
plot_outcome()
output$model
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -4.35 0.155 -28.1 7.63e-172
## 2 frequency -0.783 0.0946 -8.29 1.21e- 16
## 3 num_sylls_est 0.168 0.0166 10.1 5.63e- 24
## 4 normalized_surprisal 1.04 0.0324 32.1 7.45e-224
output$comparison
## # A tibble: 1 × 6
## res.df rss df sumsq statistic p.value
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 40445 956171. 1 1623. 68.6 1.21e-16
Directly contrast the relationships:
df_japanese_merged %>%
plot_contrast(bins = 20)
Here, we calculate the Homophony Delta for Mandarin Chinese, using the Chinese Lexical Database.
df_mandarin_cld = read_data("../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Word = col_character(),
## C1 = col_character(),
## C2 = col_character(),
## C3 = col_logical(),
## C4 = col_logical(),
## C1Structure = col_character(),
## C2Structure = col_character(),
## C3Structure = col_logical(),
## C4Structure = col_logical(),
## C1Type = col_character(),
## C2Type = col_character(),
## C3Type = col_logical(),
## C4Type = col_logical(),
## Pinyin = col_character(),
## C1Pinyin = col_character(),
## C2Pinyin = col_character(),
## C3Pinyin = col_logical(),
## C4Pinyin = col_logical(),
## IPA = col_character(),
## C1IPA = col_character()
## # ... with 126 more columns
## )
## See spec(...) for full column specifications.
## Warning: 712019 parsing failures.
## row col expected actual file
## 30786 C3 1/0/T/F/TRUE/FALSE 么 '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## 30786 C3Structure 1/0/T/F/TRUE/FALSE SG '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## 30786 C3Type 1/0/T/F/TRUE/FALSE Other '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## 30786 C3Pinyin 1/0/T/F/TRUE/FALSE me5 '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## 30786 C3IPA 1/0/T/F/TRUE/FALSE mɤ5 '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## ..... ........... .................. ...... .................................................................
## See problems(...) for more details.
nrow(df_mandarin_cld)
## [1] 41009
## Now normalize probabilities and compute expected homophones per wordform
df_mandarin_cld = df_mandarin_cld %>%
normalize_probabilities() %>%
compute_expected_homophones()
## Joining, by = "num_sylls_est"
nrow(df_mandarin_cld)
## [1] 41009
## Double-check that this equals correct amount in lexicon
df_mandarin_cld %>%
group_by(num_sylls_est) %>%
summarise(total = sum(k + 1),
correct_total = mean(M))
## # A tibble: 4 × 3
## num_sylls_est total correct_total
## <dbl> <dbl> <dbl>
## 1 1 3648 3648
## 2 2 31637 31637
## 3 3 6930 6930
## 4 4 3337 3337
Mandarin already has frequency data, but we need to make sure it's summed across lemmas.
df_mandarin_lemmas = read_csv("../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Word = col_character(),
## C1 = col_character(),
## C2 = col_character(),
## C3 = col_logical(),
## C4 = col_logical(),
## C1Structure = col_character(),
## C2Structure = col_character(),
## C3Structure = col_logical(),
## C4Structure = col_logical(),
## C1Type = col_character(),
## C2Type = col_character(),
## C3Type = col_logical(),
## C4Type = col_logical(),
## Pinyin = col_character(),
## C1Pinyin = col_character(),
## C2Pinyin = col_character(),
## C3Pinyin = col_logical(),
## C4Pinyin = col_logical(),
## IPA = col_character(),
## C1IPA = col_character()
## # ... with 125 more columns
## )
## See spec(...) for full column specifications.
## Warning: 715162 parsing failures.
## row col expected actual file
## 35286 C3 1/0/T/F/TRUE/FALSE 么 '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## 35286 C3Structure 1/0/T/F/TRUE/FALSE SG '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## 35286 C3Type 1/0/T/F/TRUE/FALSE Other '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## 35286 C3Pinyin 1/0/T/F/TRUE/FALSE me5 '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## 35286 C3IPA 1/0/T/F/TRUE/FALSE mɤ5 '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## ..... ........... .................. ...... ........................................................................
## See problems(...) for more details.
nrow(df_mandarin_lemmas)
## [1] 45552
df_mandarin_lemmas = df_mandarin_lemmas %>%
group_by(phonetic_remapped) %>%
summarise(total_frequency = sum(FrequencyRaw))
nrow(df_mandarin_lemmas)
## [1] 41009
df_mandarin_merged = df_mandarin_cld %>%
inner_join(df_mandarin_lemmas, by = "phonetic_remapped")
nrow(df_mandarin_cld)
## [1] 41009
nrow(df_mandarin_merged)
## [1] 41009
df_mandarin_merged$frequency = log10(df_mandarin_merged$total_frequency)
Now we can visualize the outcome in a variety of ways:
df_mandarin_merged %>%
plot_comparison()
df_mandarin_merged %>%
plot_bins(n = 20, column = "frequency", label = "Binned frequency")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
And finally, we can run the regression and visualize model coefficients:
output = df_mandarin_merged %>%
run_regression()
output %>%
plot_outcome()
output$model
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.99 0.0790 -25.2 3.36e-139
## 2 frequency -0.239 0.0137 -17.5 5.27e- 68
## 3 num_sylls_est -0.00307 0.0191 -0.161 8.72e- 1
## 4 normalized_surprisal 0.884 0.0174 50.7 0
output$comparison
## # A tibble: 1 × 6
## res.df rss df sumsq statistic p.value
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 41005 230776. 1 1715. 305. 5.27e-68
Directly contrast the relationships:
df_mandarin_merged %>%
plot_contrast(bins = 20)
Below, we combine the lexica so that we can visualize the central relationships in the same plot.
df_merged_english_f = df_merged_english %>%
mutate(binned_frequency = ntile(frequency, n = 20)) %>%
mutate(binned_neighorhood_size = ntile(neighborhood_size, n = 20)) %>%
mutate(language = 'English') %>%
select(num_sylls_est, normalized_surprisal, surprisal_lstm,
heldout_surprisal, binned_neighorhood_size,
num_homophones, k, frequency, binned_frequency, delta, language)
df_merged_dutch_f = df_merged_dutch %>%
mutate(binned_frequency = ntile(frequency, n = 20)) %>%
mutate(binned_neighorhood_size = ntile(neighborhood_size, n = 20)) %>%
mutate(language = 'Dutch') %>%
select(num_sylls_est, normalized_surprisal, surprisal_lstm,
heldout_surprisal, binned_neighorhood_size,
num_homophones, k, frequency, binned_frequency, delta, language)
df_merged_german_f = df_merged_german %>%
mutate(binned_frequency = ntile(frequency, n = 20)) %>%
mutate(binned_neighorhood_size = ntile(neighborhood_size, n = 20)) %>%
mutate(language = 'German') %>%
select(num_sylls_est, normalized_surprisal, surprisal_lstm,
heldout_surprisal, binned_neighorhood_size,
num_homophones, k, frequency, binned_frequency, delta, language)
df_french_f = df_french_merged %>%
mutate(binned_frequency = ntile(frequency, n = 20)) %>%
mutate(binned_neighorhood_size = ntile(neighborhood_size, n = 20)) %>%
mutate(language = 'French') %>%
select(num_sylls_est, normalized_surprisal, surprisal_lstm,
heldout_surprisal, binned_neighorhood_size,
num_homophones, k, frequency, binned_frequency, delta, language)
df_japanese_f = df_japanese_merged %>%
mutate(binned_frequency = ntile(frequency, n = 20)) %>%
mutate(binned_neighorhood_size = ntile(neighborhood_size, n = 20)) %>%
mutate(language = 'Japanese') %>%
select(num_sylls_est, normalized_surprisal, surprisal_lstm,
heldout_surprisal, binned_neighorhood_size,
num_homophones, k, frequency, binned_frequency, delta, language)
df_mandarin_cld_f = df_mandarin_merged %>%
mutate(binned_frequency = ntile(frequency, n = 20)) %>%
mutate(binned_neighorhood_size = ntile(neighborhood_size, n = 20)) %>%
mutate(language = 'Mandarin') %>%
select(num_sylls_est, normalized_surprisal, surprisal_lstm,
heldout_surprisal, binned_neighorhood_size,
num_homophones, k, frequency, binned_frequency, delta, language)
df_all_lexica = df_merged_english_f %>%
rbind(df_merged_dutch_f) %>%
rbind(df_merged_german_f) %>%
rbind(df_french_f) %>%
rbind(df_japanese_f) %>%
rbind(df_mandarin_cld_f)
PlotTheme = theme(
axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16),
strip.text.x = element_text(size = 16),
title = element_text(size = 16),
legend.text = element_text(size = 16),
legend.title = element_text(size = 16))
df_all_lexica %>%
ggplot(aes(x = heldout_surprisal,
y = surprisal_lstm)) +
geom_point(alpha = .05) +
geom_smooth(method = "lm") +
theme_bw() +
labs(x = "Surprisal (N-phone)",
y = "Surprisal (LSTM)") +
# scale_x_continuous(limits = c(0, max(df_all_lexica$surprisal_lstm))) +
# scale_y_continuous(limits = c(0, max(df_all_lexica$surprisal_lstm))) +
facet_wrap(~language)
## `geom_smooth()` using formula 'y ~ x'
# ggsave("../Figures/surprisal_comparison.png", dpi = 300)
## Correlation coefficients
df_all_lexica %>%
group_by(language) %>%
summarise(r = cor(heldout_surprisal, surprisal_lstm))
## # A tibble: 6 × 2
## language r
## <chr> <dbl>
## 1 Dutch 0.706
## 2 English 0.722
## 3 French 0.762
## 4 German 0.742
## 5 Japanese 0.913
## 6 Mandarin 0.923
df_all_lexica %>%
group_by(binned_frequency, language) %>%
summarise(mean_delta = mean(delta),
se_delta = sd(delta) / sqrt(n())) %>%
ggplot(aes(x = binned_frequency,
y = mean_delta)) +
geom_point(stat = "identity", size = .2) +
geom_hline(yintercept = 0, linetype = "dotted") +
geom_errorbar(aes(ymin = mean_delta - 2 * se_delta,
ymax = mean_delta + 2 *se_delta),
width=.2,
position=position_dodge(.9)) +
labs(x = "Binned Frequency",
y = "Delta (Real - Expected)") +
geom_smooth() +
theme_bw() +
facet_wrap(~language) +
PlotTheme
## `summarise()` has grouped output by 'binned_frequency'. You can override using the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# ggsave("../Figures/lstm_delta.png", dpi = 300)
df_all_lexica %>%
mutate(Baseline = k,
Real = num_homophones) %>%
pivot_longer(c(Baseline, Real),
names_to = "Lexicon",
values_to = "homophones") %>%
group_by(binned_frequency, language, Lexicon) %>%
summarise(mean_homophones = mean(homophones),
se_homophones = sd(homophones) / sqrt(n())) %>%
ggplot(aes(x = binned_frequency,
y = mean_homophones,
color = Lexicon)) +
geom_point(stat = "identity", size = .5, alpha = .5) +
geom_hline(yintercept = 0, linetype = "dotted") +
geom_errorbar(aes(ymin = mean_homophones - 2 * se_homophones,
ymax = mean_homophones + 2 * se_homophones),
width=.2) +
labs(x = "Binned Frequency",
y = "Number of Homophones") +
geom_smooth(alpha = .6) +
theme_bw() +
facet_wrap(~language) +
PlotTheme +
theme(panel.spacing.x = unit(1.5, "lines"))
## `summarise()` has grouped output by 'binned_frequency', 'language'. You can override using the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# ggsave("../Figures/s1_frequency.png", dpi = 300)
df_all_lexica %>%
mutate(Baseline = k,
Real = num_homophones) %>%
pivot_longer(c(Baseline, Real),
names_to = "Lexicon",
values_to = "homophones") %>%
group_by(binned_neighorhood_size, language, Lexicon) %>%
summarise(mean_homophones = mean(homophones),
se_homophones = sd(homophones) / sqrt(n())) %>%
ggplot(aes(x = binned_neighorhood_size,
y = mean_homophones,
color = Lexicon)) +
geom_point(stat = "identity", size = .5, alpha = .5) +
geom_hline(yintercept = 0, linetype = "dotted") +
geom_errorbar(aes(ymin = mean_homophones - 2 * se_homophones,
ymax = mean_homophones + 2 * se_homophones),
width=.2) +
labs(x = "Binned Neighborhood Size",
y = "Number of Homophones") +
geom_smooth(alpha = .6) +
theme_bw() +
facet_wrap(~language) +
PlotTheme +
theme(panel.spacing.x = unit(1.5, "lines"))
## `summarise()` has grouped output by 'binned_neighorhood_size', 'language'. You can override using the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# ggsave("../Figures/lstm_direct_comparison_neighbors.png", dpi = 300)
return_regression_table = function(df) {
  # Fits the full regression (delta on frequency, syllable count, and
  # normalized surprisal) and returns its tidied coefficient table.
  model_full = lm(data = df,
                  delta ~ frequency + num_sylls_est + normalized_surprisal)
  broom::tidy(model_full)
}
### Get coefficients for each language
english_coefficients = df_merged_english_f %>%
return_regression_table() %>%
mutate(language = 'English')
dutch_coefficients = df_merged_dutch_f %>%
return_regression_table() %>%
mutate(language = 'Dutch')
german_coefficients = df_merged_german_f %>%
return_regression_table() %>%
mutate(language = 'German')
french_coefficients = df_french_f %>%
return_regression_table() %>%
mutate(language = 'French')
japanese_coefficients = df_japanese_f %>%
return_regression_table() %>%
mutate(language = 'Japanese')
mandarin_coefficients = df_mandarin_cld_f %>%
return_regression_table() %>%
mutate(language = 'Mandarin')
# Combine into single dataframe
df_all_coefficients = english_coefficients %>%
rbind(dutch_coefficients) %>%
rbind(german_coefficients) %>%
rbind(french_coefficients) %>%
rbind(japanese_coefficients) %>%
rbind(mandarin_coefficients)
df_all_coefficients$predictor = fct_recode(
df_all_coefficients$term,
"Number of Syllables" = "num_sylls_est",
"Normalized Surprisal" = "normalized_surprisal",
"Log(Frequency)" = "frequency"
)
### Plots outcome of regression
df_all_coefficients %>%
filter(predictor != "(Intercept)") %>%
ggplot(aes(x = language,
y = estimate)) +
geom_point() +
coord_flip() +
geom_hline(yintercept = 0, linetype = "dotted") +
geom_errorbar(aes(ymin = estimate - 2*std.error,
ymax = estimate + 2*std.error),
width=.2,
position=position_dodge(.9)) +
labs(x = "Predictor",
y = "Estimate") +
theme_bw() +
facet_wrap(~predictor) +
PlotTheme
### Save to figure
# ggsave("../Figures/s2_coefficients.png", dpi = 300)